{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas and Dask Tutorial"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing the libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.datasets\n",
    "import pandas as pd\n",
    "from soda.scan import Scan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Artificial Pandas and Dask Dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>name</th>\n",
       "      <th>id</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>email</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2000-01-01 00:00:00</td>\n",
       "      <td>Kevin</td>\n",
       "      <td>993</td>\n",
       "      <td>0.961304</td>\n",
       "      <td>-0.794708</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2000-01-01 00:00:01</td>\n",
       "      <td>Kevin</td>\n",
       "      <td>1024</td>\n",
       "      <td>0.458458</td>\n",
       "      <td>0.976622</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2000-01-01 00:00:02</td>\n",
       "      <td>Michael</td>\n",
       "      <td>1028</td>\n",
       "      <td>0.008827</td>\n",
       "      <td>0.327489</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2000-01-01 00:00:03</td>\n",
       "      <td>Sarah</td>\n",
       "      <td>977</td>\n",
       "      <td>-0.025025</td>\n",
       "      <td>0.848088</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2000-01-01 00:00:04</td>\n",
       "      <td>Charlie</td>\n",
       "      <td>958</td>\n",
       "      <td>-0.955403</td>\n",
       "      <td>-0.577273</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            timestamp     name    id         x         y      email\n",
       "0 2000-01-01 00:00:00    Kevin   993  0.961304 -0.794708  a@soda.io\n",
       "1 2000-01-01 00:00:01    Kevin  1024  0.458458  0.976622  a@soda.io\n",
       "2 2000-01-01 00:00:02  Michael  1028  0.008827  0.327489  a@soda.io\n",
       "3 2000-01-01 00:00:03    Sarah   977 -0.025025  0.848088  a@soda.io\n",
       "4 2000-01-01 00:00:04  Charlie   958 -0.955403 -0.577273  a@soda.io"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load timeseries data from dask datasets\n",
    "df_timeseries = dask.datasets.timeseries().reset_index()\n",
    "df_timeseries[\"email\"] = \"a@soda.io\"\n",
    "\n",
    "df_timeseries.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>email</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Bastien</td>\n",
       "      <td>a@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Titus</td>\n",
       "      <td>b@soda.io</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Baturay</td>\n",
       "      <td>c@soda.io</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      name      email\n",
       "0  Bastien  a@soda.io\n",
       "1    Titus  b@soda.io\n",
       "2  Baturay  c@soda.io"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create an artificial pandas dataframe\n",
    "df_employee = pd.DataFrame(\n",
    "    {\n",
    "        \"name\": [\"Bastien\", \"Titus\", \"Baturay\"],\n",
    "        \"email\": [\"a@soda.io\", \"b@soda.io\", \"c@soda.io\"],\n",
    "    }\n",
    ")\n",
    "\n",
    "df_employee.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a soda scan object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "scan = Scan()\n",
    "scan.set_scan_definition_name(\"dask and pandas tutorial\")\n",
    "scan.set_data_source_name(\"dask\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add dataframes to the soda scan object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add dask dataframe to scan and assign a dataset name to refer from checks yaml\n",
    "scan.add_dask_dataframe(dataset_name=\"timeseries\", dask_df=df_timeseries)\n",
    "\n",
    "# Add pandas dataframe to scan and assign a dataset name to refer from checks yaml\n",
    "scan.add_pandas_dataframe(dataset_name=\"employee\", pandas_df=df_employee)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define checks in yaml format"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the first example, we will check row counts of the two dataframes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:soda.scan:Instantiating for each for ['timeseries', 'employee']\n",
      "INFO:soda.scan:[12:09:04] Scan summary:\n",
      "INFO:soda.scan:[12:09:04] 2/2 checks PASSED: \n",
      "INFO:soda.scan:[12:09:04]     timeseries in dask\n",
      "INFO:soda.scan:[12:09:04]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:04]     employee in dask\n",
      "INFO:soda.scan:[12:09:04]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:04] All is good. No failures. No warnings. No errors.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO   | Soda Core 3.0.46\n",
      "INFO   | Scan summary:\n",
      "INFO   | 2/2 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | All is good. No failures. No warnings. No errors.\n"
     ]
    }
   ],
   "source": [
    "# Define checks in yaml format\n",
    "# alternatively you can refer to a yaml file using scan.add_sodacl_yaml_file(<filepath>)\n",
    "row_count_checks = \"\"\"\n",
    "for each dataset T:\n",
    "  datasets:\n",
    "    - include %\n",
    "  checks:\n",
    "    - row_count > 0\n",
    "\"\"\"\n",
    "scan.add_sodacl_yaml_str(row_count_checks)\n",
    "scan.execute()\n",
    "print(scan.get_logs_text())\n"
   ]
  },
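  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we will apply cross checks between the pandas and dask dataframes. The first check verifies that every value of `employee.email` exists in the `timeseries.email` column; it is expected to fail because `b@soda.io` and `c@soda.io` are not present in `timeseries.email`. The second check compares the row counts of the two dataframes and is also expected to fail, since `employee` has far fewer rows than `timeseries`.\n",
    "\n",
    "Note that the same `scan` object is reused, so the checks added in the previous cell are executed again and show up in the scan summary."
   ]
  },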
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:soda.scan:Instantiating for each for ['timeseries', 'employee', 'showtables']\n",
      "INFO:soda.scan:[12:09:11] Using DefaultSampler\n",
      "INFO:soda.scan:[12:09:11] Scan summary:\n",
      "INFO:soda.scan:[12:09:11] 7/9 checks PASSED: \n",
      "INFO:soda.scan:[12:09:11]     timeseries in dask\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]     employee in dask\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11]     showtables in dask\n",
      "INFO:soda.scan:[12:09:11]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:11] 2/9 checks FAILED: \n",
      "INFO:soda.scan:[12:09:11]     employee in dask\n",
      "INFO:soda.scan:[12:09:11]       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO:soda.scan:[12:09:11]         value: 2\n",
      "INFO:soda.scan:[12:09:11]       row_count same as timeseries [FAILED]\n",
      "INFO:soda.scan:[12:09:11]         value: -2591997\n",
      "INFO:soda.scan:[12:09:11]         rowCount: 3\n",
      "INFO:soda.scan:[12:09:11]         otherRowCount: 2592000\n",
      "INFO:soda.scan:[12:09:11] Oops! 2 failures. 0 warnings. 0 errors. 7 pass.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO   | Soda Core 3.0.46\n",
      "INFO   | Scan summary:\n",
      "INFO   | 2/2 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | All is good. No failures. No warnings. No errors.\n",
      "INFO   | Using DefaultSampler\n",
      "INFO   | Scan summary:\n",
      "INFO   | 7/9 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     showtables in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | 2/9 checks FAILED: \n",
      "INFO   |     employee in dask\n",
      "INFO   |       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO   |         value: 2\n",
      "INFO   |       row_count same as timeseries [FAILED]\n",
      "INFO   |         value: -2591997\n",
      "INFO   |         rowCount: 3\n",
      "INFO   |         otherRowCount: 2592000\n",
      "INFO   | Oops! 2 failures. 0 warnings. 0 errors. 7 pass.\n"
     ]
    }
   ],
   "source": [
    "cross_table_checks = \"\"\"\n",
    "checks for employee:\n",
    "    - values in (email) must exist in timeseries (email) # Error expected\n",
    "    - row_count same as timeseries # Error expected\n",
    "\"\"\"\n",
    "scan.add_sodacl_yaml_str(cross_table_checks)\n",
    "scan.execute()\n",
    "print(scan.get_logs_text())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, add some custom checks for the `timeseries` dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:soda.scan:Instantiating for each for ['timeseries', 'employee', 'showtables']\n",
      "INFO:soda.scan:[12:09:37] Using DefaultSampler\n",
      "INFO:soda.scan:[12:09:43] Using DefaultSampler\n",
      "INFO:soda.scan:[12:09:45] Scan summary:\n",
      "INFO:soda.scan:[12:09:45] 23/30 checks PASSED: \n",
      "INFO:soda.scan:[12:09:45]     timeseries in dask\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       values in (email) must exist in employee (email) [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       invalid_count(email) = 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       valid_count(email) > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       missing_count(y) = 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       missing_percent(x) < 5% [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       avg(x) between -1 and 1 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       max(x) > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       min(x) < 1 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]     employee in dask\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]     showtables in dask\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45]       row_count > 0 [PASSED]\n",
      "INFO:soda.scan:[12:09:45] 1/30 checks WARNED: \n",
      "INFO:soda.scan:[12:09:45]     timeseries in dask\n",
      "INFO:soda.scan:[12:09:45]       missing_count(y) warn when > -1 [WARNED]\n",
      "INFO:soda.scan:[12:09:45]         check_value: 0\n",
      "INFO:soda.scan:[12:09:45] 6/30 checks FAILED: \n",
      "INFO:soda.scan:[12:09:45]     employee in dask\n",
      "INFO:soda.scan:[12:09:45]       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         value: 2\n",
      "INFO:soda.scan:[12:09:45]       row_count same as timeseries [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         value: -2591997\n",
      "INFO:soda.scan:[12:09:45]         rowCount: 3\n",
      "INFO:soda.scan:[12:09:45]         otherRowCount: 2592000\n",
      "INFO:soda.scan:[12:09:45]       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         value: 2\n",
      "INFO:soda.scan:[12:09:45]       row_count same as timeseries [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         value: -2591997\n",
      "INFO:soda.scan:[12:09:45]         rowCount: 3\n",
      "INFO:soda.scan:[12:09:45]         otherRowCount: 2592000\n",
      "INFO:soda.scan:[12:09:45]     timeseries in dask\n",
      "INFO:soda.scan:[12:09:45]       duplicate_count(name) < 4 [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         check_value: 26\n",
      "INFO:soda.scan:[12:09:45]       freshness(timestamp) < 1d [FAILED]\n",
      "INFO:soda.scan:[12:09:45]         max_column_timestamp: 2000-01-30 23:59:59\n",
      "INFO:soda.scan:[12:09:45]         max_column_timestamp_utc: 2000-01-30 23:59:59+00:00\n",
      "INFO:soda.scan:[12:09:45]         now_variable_name: NOW\n",
      "INFO:soda.scan:[12:09:45]         now_timestamp: 2023-07-20T10:09:03.468821+00:00\n",
      "INFO:soda.scan:[12:09:45]         now_timestamp_utc: 2023-07-20 10:09:03.468821+00:00\n",
      "INFO:soda.scan:[12:09:45]         freshness: 8571 days, 10:09:04.468821\n",
      "INFO:soda.scan:[12:09:45] Oops! 6 failure. 1 warning. 0 errors. 23 pass.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO   | Soda Core 3.0.46\n",
      "INFO   | Scan summary:\n",
      "INFO   | 2/2 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | All is good. No failures. No warnings. No errors.\n",
      "INFO   | Using DefaultSampler\n",
      "INFO   | Scan summary:\n",
      "INFO   | 7/9 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     showtables in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | 2/9 checks FAILED: \n",
      "INFO   |     employee in dask\n",
      "INFO   |       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO   |         value: 2\n",
      "INFO   |       row_count same as timeseries [FAILED]\n",
      "INFO   |         value: -2591997\n",
      "INFO   |         rowCount: 3\n",
      "INFO   |         otherRowCount: 2592000\n",
      "INFO   | Oops! 2 failures. 0 warnings. 0 errors. 7 pass.\n",
      "INFO   | Using DefaultSampler\n",
      "INFO   | Using DefaultSampler\n",
      "INFO   | Scan summary:\n",
      "INFO   | 23/30 checks PASSED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       values in (email) must exist in employee (email) [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       invalid_count(email) = 0 [PASSED]\n",
      "INFO   |       valid_count(email) > 0 [PASSED]\n",
      "INFO   |       missing_count(y) = 0 [PASSED]\n",
      "INFO   |       missing_percent(x) < 5% [PASSED]\n",
      "INFO   |       avg(x) between -1 and 1 [PASSED]\n",
      "INFO   |       max(x) > 0 [PASSED]\n",
      "INFO   |       min(x) < 1 [PASSED]\n",
      "INFO   |     employee in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |     showtables in dask\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   |       row_count > 0 [PASSED]\n",
      "INFO   | 1/30 checks WARNED: \n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       missing_count(y) warn when > -1 [WARNED]\n",
      "INFO   |         check_value: 0\n",
      "INFO   | 6/30 checks FAILED: \n",
      "INFO   |     employee in dask\n",
      "INFO   |       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO   |         value: 2\n",
      "INFO   |       row_count same as timeseries [FAILED]\n",
      "INFO   |         value: -2591997\n",
      "INFO   |         rowCount: 3\n",
      "INFO   |         otherRowCount: 2592000\n",
      "INFO   |       values in (email) must exist in timeseries (email) [FAILED]\n",
      "INFO   |         value: 2\n",
      "INFO   |       row_count same as timeseries [FAILED]\n",
      "INFO   |         value: -2591997\n",
      "INFO   |         rowCount: 3\n",
      "INFO   |         otherRowCount: 2592000\n",
      "INFO   |     timeseries in dask\n",
      "INFO   |       duplicate_count(name) < 4 [FAILED]\n",
      "INFO   |         check_value: 26\n",
      "INFO   |       freshness(timestamp) < 1d [FAILED]\n",
      "INFO   |         max_column_timestamp: 2000-01-30 23:59:59\n",
      "INFO   |         max_column_timestamp_utc: 2000-01-30 23:59:59+00:00\n",
      "INFO   |         now_variable_name: NOW\n",
      "INFO   |         now_timestamp: 2023-07-20T10:09:03.468821+00:00\n",
      "INFO   |         now_timestamp_utc: 2023-07-20 10:09:03.468821+00:00\n",
      "INFO   |         freshness: 8571 days, 10:09:04.468821\n",
      "INFO   | Oops! 6 failure. 1 warning. 0 errors. 23 pass.\n"
     ]
    }
   ],
   "source": [
    "timeseries_checks = \"\"\"\n",
    "checks for timeseries:\n",
    "  - invalid_count(email) = 0:\n",
    "      valid format: email\n",
    "  - valid_count(email) > 0:\n",
    "      valid format: email\n",
    "  - duplicate_count(name) < 4:\n",
    "      samples limit: 2\n",
    "  - missing_count(y):\n",
    "      warn: when > -1\n",
    "  - missing_percent(x) < 5%\n",
    "  - missing_count(y) = 0\n",
    "  - avg(x) between -1 and 1\n",
    "  - max(x) > 0\n",
    "  - min(x) < 1:\n",
    "      filter: x > 0.2\n",
    "  - freshness(timestamp) < 1d\n",
    "  - values in (email) must exist in employee (email)\n",
    "\"\"\"\n",
    "scan.add_sodacl_yaml_str(timeseries_checks)\n",
    "scan.execute()\n",
    "print(scan.get_logs_text())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fe7e78ee510bd5cd63cfcd97fd749c15efa3286469d966e90f0f6c3eeb85a4f8"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
